Variant calling and filtering

Running Stacks
module load stacks/1.46
module load gcc/4.8.5  

denovo_map.pl --samples /archive/parchman_lab/rawdata_to_backup/GSAF_11_20_bc_parse/ACTH/ -o /working/mascaro/ACTH/ -T 5 -O /home/working/ACTH/pop_map -m 3 -M 2 -n 2 -S -b 1
populations -b 1 -P /working/mascaro/ACTH -t 12 -M /working/mascaro/ACTH/pop_map --max_obs_het 0.65 -r 0.80 --vcf
Calculate allele frequency
vcftools --gzvcf VCF --freq2 --out $OUT --max-alleles 2
Calculate mean depth per individual
vcftools --gzvcf VCF --depth --out $OUT
Calculate mean depth per site
vcftools --gzvcf VCF --site-mean-depth --out $OUT
Calculate proportion of missing data per individual
vcftools --gzvcf VCF --missing-indv --out $OUT
Calculate proportion of missing data per site
vcftools --gzvcf VCF --missing-site --out $OUT
Examining statistics in R
library(tidyverse)

##Mean depth
var_depth <- read_delim("filtered.depth.ldepth.mean", delim = "\t",
                        col_names = c("chr", "pos", "mean_depth", "var_depth"), skip =1)
a <- ggplot(var_depth, aes(mean_depth)) + geom_density(fill = "dodgerblue1", colour = "black", alpha = 0.3)
a + theme_light()
summary(var_depth$mean_depth)
a + theme_light() + xlim(0, 100)

##Variant missingness
var_miss <- read_delim("missing_site.txt.lmiss", delim = "\t",
                       col_names = c("chr", "pos", "nchr", "nfiltered", "nmiss","fmiss"), skip = 1)
a <- ggplot(var_miss, aes(fmiss)) + geom_density(fill = "dodgerblue1", colour = "black", alpha = 0.3)
a + theme_light()
summary(var_miss$fmiss)

##Minor alelle frecuencies
var_freq <- read_delim("frecuencies.txt.frq", delim = "\t",
                       col_names = c("chr", "pos", "nalleles", "nchr", "a1", "a2"), skip = 1)
var_freq$maf <- var_freq %>% select(a1, a2) %>% apply(1, function(z) min(z))
a <- ggplot(var_freq, aes(maf)) + geom_density(fill = "dodgerblue1", colour = "black", alpha = 0.3)
a + theme_light()
summary(var_freq$maf)

##Mean depth per individual
ind_depth <- read_delim("depth.txt.idepth", delim = "\t",
                        col_names = c("ind", "nsites", "depth"), skip = 1)
a <- ggplot(ind_depth, aes(depth)) + geom_histogram(fill = "dodgerblue1", colour = "black", alpha = 0.3)
a + theme_light()

##Proportion of missing data per individual
ind_miss  <- read_delim("missing_indiv.txt.imiss", delim = "\t",
                        col_names = c("ind", "ndata", "nfiltered", "nmiss", "fmiss"), skip = 1)
a <- ggplot(ind_miss, aes(fmiss)) + geom_histogram(fill = "dodgerblue1", colour = "black", alpha = 0.3)
a + theme_light()
vcf filtering with vcftools
vcftools --vcf batch_sin_.vcf --maf 0.03 --max-missing 0.8 --minQ 10 --min-meanDP 5 --max-meanDP 16 --minDP 5 --maxDP 16 --recode --out ACTH_filtered.vcf

mawk '$5 > 0.5' out.imiss | cut -f1 > lowDP.indv

vcftools --vcf ACTH_filtered.vcf --remove lowDP.indv --recode --recode-INFO-all --out variants_clean

vcftools --vcf variants_clean.recode.vcf --out ACTH_filtered_final --remove-filtered-all --maf 0.03 --max-missing 0.8 --recode --thin 5
Final filtered vcf file
5677 number of loci 
261 individuals
mean depth per individuals 21.83

Genetic diversity estimates

He, Ho, Fis, Fst, Pi and TajimaD: Ho lower than He in all cases, positive Fis values (significant in all cases except for DH, EW, SC, SS and VM), positive TajimaD

library(vcfR)
library("adegenet")
library("hierfstat")
library("pegas")
ACTH.VCF <- read.vcfR("ACTH_no_DC.vcf")
ACTH.VCF

##Fis, Fst
my_genind <- vcfR2genind(ACTH.VCF)
x<- my_genind 
pops <- as.factor(x$Pop)
ploidy(x) <- 2

#Population specific Fis:
myData.hfstat <- genind2hierfstat(my_genind, pop = pops)
basicstat <- basic.stats(myData.hfstat, diploid = TRUE, digits = 4) 
basicstat$Fis
write.csv(basicstat$Fis, "Fis.csv")

##Bootstrapping over loci of population's Fis
boot.ppfis(myData.hfstat)
#Nei's Pairwise FSTs: 
x <- genet.dist(myData.hfstat,diploid=TRUE,method="Ds")# Nei’s standard genetic distance

##Bootstrapping over loci of pairwise Fst
boot.ppfst(myData.hfstat)

basicstat$Ho #observed
write.csv(basicstat$Ho, "HO.csv")
basicstat$Hs #expected
write.csv(basicstat$Hs, "Hs.csv")
basicstat

###########################################
##vcftools Pi and TajimaD
#!/bin/sh
# .vcf file
# .pop file (unique names of pops, one per line)
# .map file (individual to population mapping file — 2 columns)

cat acth.pop | while read line;
do
 grep "$line" acth.map > $line.pop
done

for p in *.pop
do
 vcftools --vcf ACTH_no_DC.vcf --keep $p --site-pi --out $p
done

for p in *.pop
do
 vcftools --vcf ACTH_no_DC.vcf --keep $p --TajimaD 100 --out $p
done
Table 1. He, Ho, Fis, Pi, Tajima D
Pop He Ho Fis bootstrap Pi TajimaD
AH 0.0290100 0.0079142 0.3436019 0.6745-0.7944 0.039 0.487
AS 0.0282694 0.0103352 0.3675841 0.5742-0.6997 0.027 0.519
BM 0.0257414 0.0085403 0.4663113 0.5952-0.7598 0.025 1.030
BV 0.0665451 0.0290545 0.4839219 0.5366-0.5985 0.067 0.339
DH 0.0442434 0.0116198 0.5216591 0.6964-0.793 0.045 0.577
EW 0.0286705 0.0114555 0.5509849 0.5378-0.6771 0.028 0.307
FR 0.0266028 0.0104184 0.6025374 0.5263-0.7024 0.026 0.487
GB 0.0603681 0.0247369 0.6111170 0.5611-0.6198 0.061 0.311
HO 0.0522192 0.0235110 0.6493415 0.5246-0.5798 0.052 0.252
JC 0.0595809 0.0137875 0.6705526 0.7373-0.8046 0.058 0.752
LV 0.0752395 0.0120090 0.6750268 0.8095-0.8687 0.074 0.512
MD 0.0584788 0.0158579 0.7041943 0.6963-0.7655 0.057 0.483
PL 0.0498516 0.0323144 0.7147738 0.3182-0.3935 0.054 0.787
PT 0.0359979 0.0244180 0.7190848 0.275-0.3682 0.041 0.183
SC 0.0304708 0.0099798 0.7279870 0.6134-0.7366 0.029 0.536
SS 0.0607011 0.0237458 0.7397952 0.5822-0.6382 0.067 0.273
VM 0.0497299 0.0166727 0.8160541 0.6302-0.6925 0.054 0.401
Table 2. Fst and bootstrap values
X AH AS BM BV DH EW FR GB HO JC LV MD PL PT SC SS VM
AH 0 0.02455154 0.03599737 0.18811044 0.3030873 0.30585451 0.03718416 0.22277811 0.04990678 0.21530206 0.22555167 0.21965323 0.24592747 0.23571014 0.03126722 0.20184171 0.22456606
AS 0.02455154 0 0.03502803 0.1904012 0.30504707 0.30627969 0.03473407 0.22735316 0.04884289 0.21750813 0.22500642 0.22319446 0.2429262 0.23499405 0.02906909 0.2044423 0.22568765
BM 0.03599737 0.03502803 0 0.18914591 0.30690705 0.30378172 0.04148891 0.22492701 0.05654367 0.21648762 0.22293814 0.22145021 0.24137825 0.23400018 0.03465313 0.2061594 0.22189048
BV 0.18811044 0.1904012 0.18914591 0 0.26644572 0.26374995 0.18950041 0.15383321 0.16270071 0.15675067 0.15997019 0.15148236 0.18241019 0.17326131 0.19038993 0.13987772 0.16349472
DH 0.3030873 0.30504707 0.30690705 0.26644572 0 0.22791262 0.30498922 0.27079827 0.27355581 0.2655081 0.25989522 0.26784242 0.2480049 0.24399845 0.30764996 0.25224443 0.23866901
EW 0.30585451 0.30627969 0.30378172 0.26374995 0.22791262 0 0.30574017 0.25745074 0.27954705 0.25867156 0.24805388 0.2573591 0.22804195 0.22611793 0.31008221 0.24726832 0.21869693
FR 0.03718416 0.03473407 0.04148891 0.18950041 0.30498922 0.30574017 0 0.22581964 0.05570926 0.21826993 0.22539898 0.22292399 0.24455783 0.23669091 0.03549687 0.20425808 0.22612998
GB 0.22277811 0.22735316 0.22492701 0.15383321 0.27079827 0.25745074 0.22581964 0 0.18854945 0.07576912 0.07746855 0.02303962 0.14299674 0.12445317 0.22746994 0.05334368 0.12274054
HO 0.04990678 0.04884289 0.05654367 0.16270071 0.27355581 0.27954705 0.05570926 0.18854945 0 0.17782536 0.18862437 0.1829804 0.20898121 0.20334486 0.05336586 0.16626241 0.19090056
JC 0.21530206 0.21750813 0.21648762 0.15675067 0.2655081 0.25867156 0.21826993 0.07576912 0.17782536 0 0.09195017 0.06826867 0.14570492 0.11813274 0.2184876 0.03048189 0.12199715
LV 0.22555167 0.22500642 0.22293814 0.15997019 0.25989522 0.24805388 0.22539898 0.07746855 0.18862437 0.09195017 0 0.07182569 0.08767191 0.09677359 0.22696216 0.07398454 0.08750547
MD 0.21965323 0.22319446 0.22145021 0.15148236 0.26784242 0.2573591 0.22292399 0.02303962 0.1829804 0.06826867 0.07182569 0 0.13883699 0.12058578 0.22360967 0.04531911 0.11662562
PL 0.24592747 0.2429262 0.24137825 0.18241019 0.2480049 0.22804195 0.24455783 0.14299674 0.20898121 0.14570492 0.08767191 0.13883699 0 0.09169221 0.24367078 0.12793599 0.05839492
PT 0.23571014 0.23499405 0.23400018 0.17326131 0.24399845 0.22611793 0.23669091 0.12445317 0.20334486 0.11813274 0.09677359 0.12058578 0.09169221 0 0.23544357 0.10472331 0.0950573
SC 0.03126722 0.02906909 0.03465313 0.19038993 0.30764996 0.31008221 0.03549687 0.22746994 0.05336586 0.2184876 0.22696216 0.22360967 0.24367078 0.23544357 0 0.20569611 0.22687392
SS 0.20184171 0.2044423 0.2061594 0.13987772 0.25224443 0.24726832 0.20425808 0.05334368 0.16626241 0.03048189 0.07398454 0.04531911 0.12793599 0.10472331 0.20569611 0 0.10312286
VM 0.22456606 0.22568765 0.22189048 0.16349472 0.23866901 0.21869693 0.22612998 0.12274054 0.19090056 0.12199715 0.08750547 0.11662562 0.05839492 0.0950573 0.22687392 0.10312286 0
Bootstrap values
AH AS BM BV DH EW FR GB HO JC LV MD PL PT SC SS VM
AH NA 0.4381463 0.5548054 0.7582905 0.862705 0.8940217 0.5501907 0.8009213 0.514907 0.7985442 0.7745976 0.8037476 0.8236019 0.836636 0.4886056 0.7679713 0.8052715
AS 0.3841946 NA 0.5683157 0.7764327 0.8707072 0.9011536 0.5582013 0.8141863 0.5327442 0.809954 0.787304 0.8176951 0.8319955 0.8476897 0.4940938 0.7821249 0.8200815
BM 0.4983713 0.5046686 NA 0.7889293 0.8795135 0.9053105 0.6133679 0.8249111 0.5876682 0.8207844 0.7983388 0.8267424 0.8389642 0.8558063 0.548618 0.7948679 0.8306031
BV 0.7352616 0.753893 0.7636459 NA 0.7847021 0.8168635 0.7777854 0.6616048 0.6972369 0.676409 0.6494697 0.6726021 0.7005897 0.7007159 0.7752929 0.6126693 0.6644014
DH 0.8452166 0.8563767 0.8622362 0.763539 NA 0.8341505 0.8738641 0.7957134 0.8133675 0.7977939 0.765235 0.8017703 0.7861553 0.802818 0.8688137 0.7626494 0.7764742
EW 0.8783854 0.8859661 0.8888133 0.7987098 0.8156641 NA 0.9054352 0.8239031 0.8526823 0.8286562 0.7977545 0.8305635 0.8155849 0.8388079 0.8994022 0.8025277 0.809619
FR 0.5085393 0.4997065 0.5565828 0.7571045 0.8603357 0.8915238 NA 0.8175928 0.5720465 0.8161523 0.7915404 0.8196891 0.8375722 0.8524922 0.5488602 0.7857988 0.8238386
GB 0.7758175 0.7930538 0.8014575 0.6314688 0.7760831 0.8065095 0.7996026 NA 0.7328556 0.5156367 0.4816726 0.2362744 0.6599499 0.6464671 0.8129906 0.3874461 0.6150905
HO 0.4703591 0.4873377 0.538907 0.6695532 0.7933297 0.8340722 0.5301681 0.7073132 NA 0.7320288 0.7099636 0.738091 0.7535495 0.7648777 0.5461877 0.6824895 0.7313526
JC 0.7749871 0.7888573 0.7983775 0.6505446 0.7770517 0.8115683 0.7947513 0.4791071 0.7029462 NA 0.5353314 0.5074439 0.6719503 0.6421238 0.8088347 0.2779044 0.6244581
LV 0.7537827 0.7668715 0.7785227 0.6242164 0.7451415 0.7789623 0.7739154 0.4419769 0.6898891 0.5062993 NA 0.4797075 0.5230145 0.5487319 0.7882241 0.4353739 0.5017379
MD 0.7794006 0.7969911 0.8036258 0.6432901 0.7804947 0.813464 0.801387 0.1992173 0.7103849 0.4662691 0.4421077 NA 0.6662835 0.6500669 0.8160606 0.3664239 0.6184875
PL 0.800303 0.8121095 0.8184893 0.6688832 0.7600137 0.7966814 0.8187304 0.6241127 0.7295041 0.6372993 0.4826797 0.6305185 NA 0.5884116 0.8282213 0.6136611 0.450323
PT 0.8124549 0.8266832 0.833119 0.6707638 0.7751408 0.816624 0.8352006 0.6091912 0.7420517 0.6035071 0.5130083 0.612439 0.5421181 NA 0.8425983 0.5812223 0.5877566
SC 0.441014 0.4322706 0.4930594 0.7528763 0.8536278 0.8853198 0.50524 0.7931312 0.5123621 0.788571 0.7686164 0.7946612 0.8073957 0.8229356 NA 0.7794287 0.8169956
SS 0.739587 0.7556143 0.7691368 0.5765702 0.7407466 0.7814624 0.7632292 0.3399881 0.6515144 0.230449 0.3971302 0.3163408 0.5734949 0.5312998 0.7555634 NA 0.5463611
VM 0.7825421 0.7959215 0.8040112 0.6376172 0.7493949 0.785401 0.8046526 0.5793391 0.7046233 0.5868335 0.465691 0.5791253 0.4075067 0.5425078 0.7940393 0.5055829 NA

PCA and UMAP

PCA following T. Faske:

library(data.table)
library(ggplot2)
library(ggsci)
library(umap)
library(LEA)
library(readr)
library(ggpubr)

##### PCA FUNCTION #####

##################### PCA_gen ##########################

#### PCA for 012 coded  vcf files
#### Following method in Patterson et al 2006
##### input files #####
### df_gen: genotypic data with individuals as rows and snps as coloumns.
###         Can include missing data. Either genotype probabilities or 012 format
### indv:  dataframe with rows corresponding to individuals in df_gen file 
###        Must have Pop and ID coloumn 
##### output files ####
### pca_out: 
### $ 'pca_df': dataframe with rows as individuals and coloumns as PC1-30, Pop, ID
### $ 'pve': list of proportion of variance explained for each PC

############# function ######################

PCA_gen <- function(df_gen,indv,num=5){ #add ggplot, add tw, add # of output
  #pkgTest("ggplot2")
  df_gen <- apply(df_gen, 2, function(df) gsub(-1,NA,df,fixed=TRUE))
  df_gen <- apply(df_gen, 2, function(df) as.numeric(df))
  colmean<-apply(df_gen,2,mean,na.rm=TRUE)
  normalize<-matrix(nrow = nrow(df_gen),ncol=ncol(df_gen))
  af<-colmean/2
  for (m in 1:length(af)){
    nr<-df_gen[ ,m]-colmean[m]
    dn<-sqrt(af[m]*(1-af[m]))
    normalize[ ,m]<-nr/dn
  }
  
  normalize[is.na(normalize)]<-0
  method1<-prcomp(normalize, scale. = FALSE,center = FALSE)
  pve <- summary(method1)$importance[2,]
  print(pve[1:5])
  if(nrow(df_gen) < num){
    num <- nrow(df_gen)
  }
  
  pca_X<- method1$x[,1:num]
  pca_X <- as.data.frame(pca_X)
  pca_X$Pop <- indv$Pop
  pca_X$ID <- indv$ID
  pca_out <- list("pca_df"=pca_X,"pve"=pve)
  #print(PCA_fig(pca_out))
  return(pca_out)
}

######################## PCA_fig() ###################

PCA_fig <- function(pca_out,fill="Pop"){ #add PCA setting or normal
  xlab <- paste("PCA1 (",round(pca_out$pve[1]*100,2),"%)",sep="")
  ylab <- paste("PCA2 (",round(pca_out$pve[2]*100,2),"%)",sep="")
  pca_df <- pca_out$pca_df
  filler <- as.character(pca_df[[fill]])
  ggplot(data = pca_df, aes(x=PC1,y=PC2,fill=filler)) + 
    geom_point(pch=21,colour='black',size = 3)+ 
    xlab(xlab) + ylab(ylab) +
    theme_bw() + 
    theme(#legend.position = 'none',
      axis.text = element_text(size=13), 
      axis.title = element_text(size = 16, colour="black",face = "bold",vjust = 1),
      panel.border = element_rect(size = 1.5, colour = "black"),
      legend.text = element_text(size=13),
      legend.title = element_text(size = 16, colour="black",face = "bold",vjust = 1),
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank())
}


######################################################################################
######################################################################################
#### START HERE #####
######################################################################################
######################################################################################
###### MAKE PCA 

df012<-fread("variantes012.012",sep="\t", data.table=F) 
df012 <- df012[,-1]
Pop_ID_Sum <- read.csv('Pop_ID.csv')

############# PCA ######################

pca_out <- PCA_gen(df012,Pop_ID_Sum)
pve <- pca_out$pve[1:5]
pve
#    PC1     PC2     PC3     PC4     PC5 
#0.15041 0.11894 0.11241 0.08590 0.05601 
pca_df <- pca_out$pca_df[,1:5]
pca_df <- cbind(Pop_ID_Sum,pca_df)

write.csv(pca_df,'pca_df.csv',row.names = FALSE)

############## PLOT ##################

pca_df <- read.csv("pca_df.csv")
pve <- c(0.15041, 0.11894, 0.11241, 0.08590, 0.05601)

#### Pop ####

PCA1VS2 <- ggplot(data = pca_df, aes(x=PC1,y=PC2,fill=as.character(Pop))) + 
  geom_point(pch=21,colour='black',size = 4)+ #ggtitle("PCA ACTH") +
  xlab(paste("PC",1," (",pve[1]*100,"%)",sep="")) + ylab(paste("PC",2," (",pve[2]*100,"%)",sep=""))+
  scale_fill_d3(name='Pop:',palette = 'category20') + 
  theme_bw() + 
  theme(legend.position = 'none',
        axis.text = element_text(size=11), 
        axis.title = element_text(size = 13, colour="black",face = "bold",vjust = 1),
        panel.border = element_rect(size = 1.5, colour = "black"),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())
ggsave("pca1vs2_2.pdf",PCA1VS2,height=8,width = 12,units = 'in')


PCA2VS3 <- ggplot(data = pca_df, aes(x=PC2,y=PC3,fill=as.character(Pop))) + 
  geom_point(pch=21,colour='black',size = 4) +
  xlab(paste("PC",2," (",pve[2]*100,"%)",sep="")) + ylab(paste("PC",3," (",pve[3]*100,"%)",sep=""))+
  scale_fill_d3(name='Pop:',palette = 'category20') + 
  theme_bw() + 
  theme(legend.position = 'none',
        axis.text = element_text(size=11), 
        axis.title = element_text(size = 13, colour="black",face = "bold",vjust = 1),
        panel.border = element_rect(size = 1.5, colour = "black"),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())
ggsave("pca2vs3.pdf",PCA2VS3,height=8,width = 12,units = 'in')


PCA3VS4 <- ggplot(data = pca_df, aes(x=PC3,y=PC4,fill=as.character(Pop))) + 
  geom_point(pch=21,colour='black',size = 4)+   #ggtitle("PCA") +
  xlab(paste("PC",3," (",pve[3]*100,"%)",sep="")) + ylab(paste("PC",4," (",pve[4]*100,"%)",sep=""))+
  scale_fill_d3(name='Pop:',palette = 'category20') + 
  theme_bw() + 
  theme(legend.position = 'none',
        axis.text = element_text(size=11), 
        axis.title = element_text(size = 13, colour="black",face = "bold",vjust = 1),
        panel.border = element_rect(size = 1.5, colour = "black"),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())
ggsave("pca3vs4.pdf",PCA3VS4,height=8,width = 12,units = 'in')

-1. PCA1 vs PCA2
-2. PCA2 vs PCA3
-3. PCA3 vs PCA4

#### UMAP
custom.settings = umap.defaults
min_dist = c(0.1, 0.25, 0.50, 0.80, 0.99)
n_neighbors = c(2,4,6,8,10,12,14,16)
cols <- c("#1773b3ff", "#ff7d02ff","#219c21ff","#d41c1cff","#e373c2ff","#b8bd17ff","#17bdcfff","#aec7e6ff","#ffb878ff","#97de8aff","#ff9794ff","#c5b0d4ff","#c29c94ff","#f5b3cfff","#c7c7c7ff","#d9d98aff","#9cd9e3ff") 
#col17 <- pal_d3(palette='category20')(20)
custom.settings$n_neighbors <-n_neighbors[1] 
myplots <- list()
# library(scater)
for (i in 1:length(min_dist)){
  rm(umap_g)
  rm(umap_g_plot)
  custom.settings$min_dist<-min_dist[i]
  umap_g <- as.data.frame(umap(df012,config = custom.settings)$layout )
  names(umap_g) <- c('layout1','layout2')
  umap_g <- cbind(Pop_ID_Sum,umap_g)

  #now plotting
  myplots[[i]]<-ggplot(data = umap_g, aes(x=layout1,y=layout2,fill=as.character(Pop))) +
    geom_point(colour='black',size = 4,pch=21) + ggtitle(paste0("UMAP n_neighbors 16 min_dist ",min_dist[i])) +
    xlab('Layout 1') + ylab('Layout 2') +
    scale_fill_manual(values= cols)+ theme_bw() +
    theme(legend.position = 'none', 
      plot.title = element_text(size = 8, colour="black"),
      axis.text = element_text(size=6),
          axis.title = element_text(size = 8, colour="black",face = "bold"),
          panel.border = element_rect(size = 1.5, colour = "black"),
      #legend.title = element_text(size = 6, colour="black",face = "bold",vjust = 1),
      #legend.text = element_text(size=3),
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank()) 
  }

x <- gridExtra::grid.arrange(grobs = myplots)
ggsave("/home/caro/Escritorio/ACTH/ACTH no DC/PCA UMAP/neighb_.pdf",x,height=10,width = 8,units = 'in')

-1. Supplementary UMAP

IBD, EEMS, un-PC

IBD:
#############
library(ape)
library(ade4)
library(plyr)
library(vegan)
library(ggplot2)
library(dartR)

####IBD test
gen <- read.csv("pairwise.csv", header = TRUE)
Dgen <-as.matrix(gen)[, -1]
Dgen

coor <- read.csv("lat_lon.csv", header = TRUE)
latlong <- dist(cbind(coor$Lat, coor$Long))
Dgeo  <- as.matrix(latlong)[1:17, 1:17]
Dgeo

# mantel_vegan  <- mantel(Dgen, Dgeo, method = "spearman", permutations = 9999, na.rm = TRUE)
# mantel_vegan
# plot(Dgen, Dgeo)

Dgen_2<-as.matrix(as.dist(Dgen))
gl.ibd(x = NULL,Dgen = Dgen_2,
       Dgeo = Dgeo,
       permutations = 999999,
       plot = TRUE)

Results:

#Mantel statistic based on Pearson's product-moment correlation 
Call:
vegan::mantel(xdis = Dgen, ydis = Dgeo, permutations = 999, na.rm = TRUE) 

Mantel statistic r: 0.4163 
      Significance: 0.002 

Upper quantiles of permutations (null model):
  90%   95% 97.5%   99% 
0.144 0.181 0.216 0.294 
Permutation: free
Number of permutations: 999
EEMS:
#############

EEMS
####
https://github.com/dipetkov/eems.git
#This repository contains an implementation of the EEMS method for analyzing and visualizing spatial population structure from geo-referenced genetic samples
Download Eigen 3.2.2 (Eigen does not requires installation)
Download Boost 1.57 (other version might not be compatibles)

For Boost installation run the following:

sed -e '1 i#ifndef Q_MOC_RUN' \
    -e '$ a#endif'            \
    -i boost/type_traits/detail/has_binary_operator.hpp &&

./bootstrap.sh --prefix=/usr &&
./b2 stage threading=multi link=shared

sudo su ./b2 install threading=multi link=shared
After that, go to the Makefile from the runeems_snps/src file and modify EIGEN_INC, BOOST_LIB, and BOOST_INC

The run, make linux

EEMS for SNPs: Three input files are required, data.diffs, data.coord, data.outer

data.diffs: the matrix of average pairwise genetic differences. This can be computed with bed2diffs (check before running it the differences among both version of bed2diffs availables)
Go to the bed2diffs/src folder

bed2diffs uses the libplinkio library to read genotype data stored in plink binary format. To install libplinkio, first clone the GitHub repository and get the latest version (commit 781e9ee37076)

git clone https://github.com/mfranberg/libplinkio
cd libplinkio
git checkout 781e9ee37076
Install libplinkio to a custom location /path/to/plinkio

../configure --prefix=/path/to/plinkio
make && make check && make install
Update the PLINKIO location in the Makefile, both in the src and in the src-without-openmp directories
Compile using make linux

Usage (in the src folder):

./bed2diifs_v2 --bfile ./nameofthefiles* --nthreads 2
*name of the bed fam and ped files without the extension
*to get the dissimilarity matrix it is important running the bed2diifs_v2, not the bed2diifs_v1
  
Then you got the dissimilarity matrix. Load in R:

diffs <- read.table("./test/example-SNP-major-mode.diffs")
diffs
#the matrix should be a nonnegative, symmetric with zeros on the diagonal 
data.coord: the sample coordinates (two coordinates per sample, one sample per line)
datapath.outer: the habitat coordinates (as a sequence of vertices that outline a closed polygon). You can get these coordinates in Google Maps API v3 Tool
Then, EEMS requires a configuration file with "ini" extension, for example:

datapath = ./data/inputdata
mcmcpath = ./data/outputdata
nIndiv = 246
nSites = 5677
nDemes = 300
diploid = false
numMCMCIter = 200000000
numBurnIter = 100000000
numThinIter = 999999

./runeems_snps --params configurationfile.ini --seed 123 (randome seed, optional)
./runeems_snps --params configurationfile2.ini --seed 123 
./runeems_snps --params configurationfile3.ini --seed 123 

EEMS results visualization:

#install_github("dipetkov/eems/plotting/rEEMSplots")
library(rEEMSplots)

mcmcpath <- c("./eems_output/output_1", "./eems_output/output_2", "./eems_output/output_3")
plotpath = (""./eems_output/plots"")

eems.plots(mcmcpath, plotpath, longlat = TRUE, add.grid = TRUE, add.demes = TRUE, add.outline = TRUE, col.outline = "black", lwd.outline = 2, col.demes = "red", pch.demes =5, min.cex.demes = 0.5, max.cex.demes =1.5)
un-PC:
############################################################
### unPC: uses the principal components from any PCA method applied to genetic data in combination with geographic coordinates of the samples to create visualizations of geneticdifferentiation across the landscape.

### Un-PC first calculates the Euclidean distance between the PCA coordinates for each pair of populations to generate a PCA-based genetic distance. The pairwise geographic distance between populations is also calculated.

### The ratio of the genetic distance to the geographic distance for each pair of pop. is the un-PC value for that pair. Larger un-PC values (larger genetic distance relative to geographic distance) indicate increased genetic differentiation between the populations, while smaller un-PC values indicate less genetic differentiation.

### Ellipses with un-PC values near the mean of the distribution are colored white, while ellipses with higher un-PC values (greater differentiation) are colored progressively more pink, and ellipses with lower un-PC values (less differentiation) are colored progressively more green.

devtools::install_github("hahnlab/un-PC")
library(unPC)

unPC(inputToProcess  = "/home/caro/Escritorio/ACTH/ACTH no DC/unPC def/pca_eigen.evec", outputPrefix = "unPC",
     runDataImport = TRUE, runPairwiseCalc = TRUE, geogrCoords= "/home/caro/Escritorio/ACTH/ACTH no DC/unPC def/Lon_Lat.txt",
     roundEarth = TRUE, firstPC = 1, secondPC = 2, runPlotting = TRUE,
     applyManualColor = FALSE, geogrCoordsForPlotting = "/home/caro/Escritorio/ACTH/ACTH no DC/unPC def/Lon_Lat_popnAggregated.txt", plotWithMap = FALSE,
     ellipseWidth = 0.05, populationPointNormalization = 7,
    runAggregated = TRUE, savePlotsToPdf = TRUE)

unpcout <- readRDS("/home/caro/Escritorio/ACTH/ACTH no DC/unPC def/unPC_pairwiseDistCalc_unPC.rds")
unPC <- unpcout$ratioPCToGeogrDist # <- here are the unPC scores!
write.csv(unpcout, "unPCscores.csv")

############################################################

Admixture

Running admixture:

vcftools --vcf acth.vcf --plink-tped --out acth
plink --tped acth.tped --tfam acth.tfam --make-bed --out acth

for i in {2..17}
do
 ./admixture --cv acth.bed $i > log${i}.out
done

grep "CV" *out | awk '{print $3,$4}' | sed -e 's/(//;s/)//;s/://;s/K=//'  > binary.cv.error
Rscript admixture.R -p PH -i individual.populations.txt -k 8 -m 8 -l pop names

Phylogenetic analyses

Convert vcf format to fasta using vcf2phylip.py:

python vcf2phylip.py --i ACTH.VCF --fasta

Trimming the alignment with trimAl:

./trimal -in acth.fasta -gappyout -out acth_fasta_trimmed.phy

Run iqtree:

./iqtree -s acth_fasta_trimmed.phy -nt 4 -st DNA -m MFP -bb 1000
library(phytools)
packageVersion("phytools")

# Latitude and longitude coordinates
coords<- read.csv("lat_lon.csv")
# Tree file
tree<-read.tree("acth_fasta_trimmed.phy.treefile")
rownames(coords)<-coords$Loc
coords<-coords[,-1]
library(mapdata)
library(viridis)
plot(tree)

# Drop tree (simplify the tree to get one sample per population, specify the tip to remove)
tree_drop <-drop.tip(tree, tip=c("ATAH02","ATAH03","ATAH04","ATAH05","ATAH06","ATAH07","ATAH08","ATAH09","ATAH10","ATAH12","ATAH13","ATAH14","ATAH15","ATAS02","ATAS03","ATAS04","ATAS05","ATAS06","ATAS07","ATAS08","ATAS09","ATAS10","ATAS11","ATAS12","ATAS13","ATAS14","ATAS15","ATBM10","ATBM11","ATBM12","ATBM13","ATBM14","ATBM15","ATBM16","ATBM17","ATBM18","ATBM2","ATBM3","ATBM4","ATBM5","ATBM6","ATBM7","ATBM8","ATBM9","ATBV02","ATBV03","ATBV04","ATBV05","ATBV06","ATBV07","ATBV08","ATBV09","ATBV10","ATBV11","ATBV12","ATBV13","ATBV14","ATBV15","ATDH02","ATDH03","ATDH04","ATDH05","ATDH06","ATDH07","ATDH08","ATDH09","ATDH10","ATDH12","ATDH13","ATDH14","ATDH15","ATEW02","ATEW03","ATEW04","ATEW05","ATEW06","ATEW07","ATEW08","ATEW09","ATEW10","ATEW11","ATEW12","ATEW13","ATEW14","ATEW15","ATFR02","ATFR03","ATFR04","ATFR05","ATFR06","ATFR07","ATFR08","ATFR09","ATFR10","ATFR11","ATFR12","ATFR13","ATFR14","ATGB02","ATGB03","ATGB05","ATGB06","ATGB07","ATGB08","ATGB09","ATGB10","ATGB11","ATGB12","ATGB13","ATGB14","ATGB15","ATHO02","ATHO03","ATHO04","ATHO05","ATHO06","ATHO07","ATHO08","ATHO09","ATHO10","ATHO11","ATHO12","ATHO13","ATHO14","ATHO15","ATJC02","ATJC03","ATJC04","ATJC05","ATJC06","ATJC07","ATJC08","ATJC09","ATJC10","ATJC11","ATJC12","ATJC13","ATJC14","ATJC15","ATLV02","ATLV04","ATLV05","ATLV06","ATLV07","ATLV08","ATLV09","ATLV10","ATLV11","ATLV12","ATLV13","ATLV14","ATLV15","ATMD02","ATMD03","ATMD04","ATMD05","ATMD06","ATMD07","ATMD08","ATMD09","ATMD10","ATMD11","ATMD12","ATMD13","ATMD14","ATMD15","ATPL02","ATPL03","ATPL04","ATPL05","ATPL06","ATPL07","ATPL08","ATPL09","ATPL10","ATPL11","ATPL12","ATPL13","ATPL14","ATPL15","ATPT02","ATPT03","ATPT04","ATPT05","ATPT06","ATPT07","ATPT08","ATPT09","ATPT10","ATPT11","ATPT12","ATSC02","ATSC03","ATSC04","ATSC05","ATSC06","ATSC07","ATSC08","ATSC09","ATSC10","ATSC11","ATSC12","ATSC13","ATSC14","ATSC15","ATSS02","ATSS03","ATSS04","ATSS05","ATSS06","ATSS07","ATSS08","ATSS09","ATSS11","ATSS12","ATSS13","ATSS14","ATSS15","ATVM03","ATVM04","ATVM05","ATVM08","ATVM09","ATVM10","ATVM11","ATVM12","ATVM13","ATVM14","ATVM15"), trim.internal = TRUE, subtree = FALSE,
                                 root.edge = 0,collapse.singles = TRUE,
                                 interactive = FALSE)
                     

tree_drop$tip.label
# Load the database "state" and specify which states would be pictured
obj<-phylo.to.map(tree_drop,coords,database="state",
                  regions=c("Nevada","California","Oregon"),plot=F)

plot(obj,direction="rightwards",colors=cols,xlim=c(-100,100),ylim=c(-100,100))
plot(obj,type="phylogram",asp=1)

AMOVA and LD

AMOVA: 79.66 % of the observed genetic variance is explained by variation between populations

library(vcfR)
library("adegenet")
library("hierfstat")
library("pegas")
library("poppr")
library("magrittr")
library(ape)

# vcf to genind
setwd("/home/caro/Escritorio/ACTH/ACTH no DC/")
ACTH.VCF <- read.vcfR("ACTH_no_DC.vcf")
my_genind <- vcfR2genind(ACTH.VCF)

### Give a data set with stratification (individual, populations, subpopulations..)
file.hier = read.table("ind_pop.txt", header=T)
strata(my_genind) = file.hier
my_genind

## amova with stratifications
amova.res.95 = poppr.amova(acth.hier.strat, ~pop/subpop, cutoff=0.95)
amova.res.95
write.table(amova.res.95$results, sep = ",", file = "results_amova.csv")
write.table(amova.res.95$componentsofcovariance, sep = ",", file = "Components_covariance.csv")
write.table(amova.res.95$statphi, sep = ",", file = "phi.csv")

## To test if populations are significantly different
amova.test.95 = randtest(amova.res.95)
amova.test.95
# AMOVA results
> amova.res.95
$call
ade4::amova(samples = xtab, distances = xdist, structures = xstruct)

$results
                 Df    Sum Sq    Mean Sq
Between samples  16 203377.59 12711.0993
Within samples  229  50468.42   220.3861
Total           245 253846.01  1036.1062

$componentsofcovariance
                                Sigma         %
Variations  Between samples  863.6075  79.66906
Variations  Within samples   220.3861  20.33094
Total variations            1083.9936 100.00000

$statphi
                        Phi
Phi-samples-total 0.7966906

> amova.test.95
Monte-Carlo test
Call: as.randtest(sim = res, obs = sigma[1])

Observation: 863.6075 

Based on 99 replicates
Simulated p-value: 0.01 
Alternative hypothesis: greater 

    Std.Obs Expectation    Variance 
  93.237889   -1.234629   86.037702 
library("poppr")
library("magrittr")

AH <- popsub(x, "1")
AH %>% clonecorrect(strata= ~ind/pop) %>% ia(sample = 999)

AS <- popsub(x, "2")
AS %>% clonecorrect(strata= ~pop) %>% ia(sample = 999)

BM <- popsub(x, "3")
BM %>% clonecorrect(strata= ~pop) %>% ia(sample = 999)

BV <- popsub(x, "4")
BV %>% clonecorrect(strata= ~pop) %>% ia(sample = 999)

DH <- popsub(x, "6")
DH %>% clonecorrect(strata= ~pop) %>% ia(sample = 999)

EW <- popsub(x, "7")
EW %>% clonecorrect(strata= ~pop) %>% ia(sample = 999)

FR <- popsub(x, "8")
FR %>% clonecorrect(strata= ~pop) %>% ia(sample = 999)

GB <- popsub(x, "9")
GB %>% clonecorrect(strata= ~pop) %>% ia(sample = 999)

HO <- popsub(x, "10")
HO %>% clonecorrect(strata= ~pop) %>% ia(sample = 999)

JC <- popsub(x, "11")
JC %>% clonecorrect(strata= ~pop) %>% ia(sample = 999)

LV <- popsub(x, "12")
LV %>% clonecorrect(strata= ~pop) %>% ia(sample = 999)

MD <- popsub(x, "13")
MD %>% clonecorrect(strata= ~pop) %>% ia(sample = 999)

PL <- popsub(x, "14")
PL %>% clonecorrect(strata= ~pop) %>% ia(sample = 999)

PT <- popsub(x, "15")
PT %>% clonecorrect(strata= ~pop) %>% ia(sample = 999)

SC <- popsub(x, "16")
SC %>% clonecorrect(strata= ~pop) %>% ia(sample = 999)

SS <- popsub(x, "17")
SS %>% clonecorrect(strata= ~pop) %>% ia(sample = 999)

VM <- popsub(x, "18")
VM %>% clonecorrect(strata= ~pop) %>% ia(sample = 999)

Table 3. LD results: rd and Ia significant in all cases

Pop rd p value Ia p.1 value.1 Ia.1
AH 0.2830 0.001 122.350 0.001 0.0290100 0.0079142 0.3436019
AS 0.2260 0.001 93.120 0.001 0.0282694 0.0103352 0.3675841
BM 0.1860 0.001 58.549 0.001 0.0257414 0.0085403 0.4663113
BV 0.0190 0.001 21.443 0.001 0.0665451 0.0290545 0.4839219
DH 0.1370 0.001 87.699 0.001 0.0442434 0.0116198 0.5216591
EW 0.3140 0.001 148.414 0.001 0.0286705 0.0114555 0.5509849
FR 0.2710 0.001 106.050 0.001 0.0266028 0.0104184 0.6025374
GB 0.0900 0.001 89.900 0.001 0.0603681 0.0247369 0.6111170
HO 0.0250 0.001 22.540 0.001 0.0522192 0.0235110 0.6493415
JC 0.1020 0.001 86.020 0.001 0.0595809 0.0137875 0.6705526
LV 0.2360 0.001 270.389 0.001 0.0752395 0.0120090 0.6750268
MD 0.0470 0.001 43.020 0.001 0.0584788 0.0158579 0.7041943
PL 0.1800 0.001 121.440 0.001 0.0498516 0.0323144 0.7147738
PT 0.1326 0.001 81.633 0.001 0.0359979 0.0244180 0.7190848
SC 0.3290 0.001 143.590 0.001 0.0304708 0.0099798 0.7279870
SS 0.0290 0.001 29.786 0.001 0.0607011 0.0237458 0.7397952
VM 0.0840 0.001 64.681 0.001 0.0497299 0.0166727 0.8160541

RDA

############## RDA
library(adegenet)
library(poppr)
library(tidyverse)
library(vcfR)
library(hierfstat)
library(pegas)
library(magrittr)
library(ape)
library(psych)
library(adespatial)
library(vegan)

# Load the genetic data (012) and make sure that not NAs are present
setwd("~/Escritorio/ACTH/ACTH no DC/RDA")
gen <- read.csv("acth_012.csv", row.names=1)
dim(gen)

# Env. variables
Env <- read.csv("Pop_prism_annual_ind.csv", header = TRUE)
str(env) 
# Make ID and Pop characters
Env$ID <- as.character(env$ID) 
Env$Pop <- as.character(env$Pop) 
# Genotipes and environmental in same order
identical(rownames(gen.imp), Env[,1]) 

# Coordinates and pop structure variables
Coordinates <- read.csv("Lat_Lon_sin.csv")
PopStruct <- read.csv("pca1_pca2.csv")

## Table gathering all variables
Variables <- data.frame(Env, Coordinates, PopStruct[,-1])
Variables[1:5,]

##     ID Pop    ppt tdmean  tmax tmean  tmin vpdmax vpdmin     Lon      Lat       PC1      PC2
##1 ATAH01  AH 213.79  -4.83 17.52  8.35 -0.83  19.52   2.45 -117.16 39.60081 -58.73467 18.27712
##2 ATAH02  AH 213.79  -4.83 17.52  8.35 -0.83  19.52   2.45 -117.16 39.60081 -69.58830 21.96260
##3 ATAH03  AH 213.79  -4.83 17.52  8.35 -0.83  19.52   2.45 -117.16 39.60081 -68.97447 21.92826
##4 ATAH04  AH 213.79  -4.83 17.52  8.35 -0.83  19.52   2.45 -117.16 39.60081 -68.99343 21.60410
##5 ATAH05  AH 213.79  -4.83 17.52  8.35 -0.83  19.52   2.45 -117.16 39.60081 -65.59592 19.83864

## Null model
RDA0 <- rda(gen.imp ~ 1,  Variables) 
## Full model
RDAfull <- rda(gen.imp ~ tdmean+ tmax+ tmean + tmin+ vpdmax +vpdmin+ Lon + Lat + PC1+ PC2, Variables)
mod <- ordiR2step(RDA0, RDAfull, Pin = 0.01, R2permutations = 1000, R2scope = T)
mod$anova

##                 R2.adj Df    AIC       F Pr(>F)   
##+ PC1           0.21042  1 1455.4 66.2921  0.002 **
##+ PC2           0.32625  1 1417.4 42.9470  0.002 **
##+ Lat           0.36448  1 1404.0 15.6197  0.002 **
##+ tmean         0.40122  1 1390.3 15.8462  0.002 **
##+ Lon           0.42377  1 1381.9 10.4327  0.002 **
##+ tdmean        0.43822  1 1376.6  7.1711  0.002 **
##+ vpdmax        0.45405  1 1370.5  7.9305  0.002 **
##+ tmax          0.48063  1 1359.2 13.1808  0.002 **
##+ tmin          0.50816  1 1346.8 14.2678  0.002 **
##+ vpdmin        0.52803  1 1337.6 10.9325  0.002 **
##<All variables> 0.52803 


## Full model
pRDAfull <- rda(gen.imp ~ PC1 + PC2 + Lon + Lat + tmean + tdmean + vpdmax + tmax + tmin + vpdmin, Variables)
pRDAfull
RsquareAdj(pRDAfull)
anova(pRDAfull)

## Pure climate model
pRDAclim <- rda(gen.imp ~ tmean + tdmean + vpdmax + tmax + tmin + vpdmin + Condition(Lon + Lat + PC1 + PC2),  Variables)
pRDAclim
RsquareAdj(pRDAclim)
anova(pRDAclim)

## Pure neutral population structure model  
pRDAstruct <- rda(gen.imp ~ PC1 + PC2 + Condition(Lon + Lat + tmean + tdmean + vpdmax + tmax + tmin + vpdmin),  Variables)
pRDAstruct
RsquareAdj(pRDAstruct)
anova(pRDAstruct)

##Pure geography model 
pRDAgeog <- rda(gen.imp ~ Lon + Lat + Condition(tmean + tdmean + vpdmax + tmax + tmin + vpdmin + PC1 + PC2),  Variables)
pRDAgeog
RsquareAdj(pRDAgeog)
anova(pRDAgeog)
Partial.RDA.models Inertia R2 P Proportion.of.explainable.variance Proportion.of.total.variance
Full model: gen ~ env + structur + geog 255.1100 0.5472 0.001*** 1 0.5473
Pure climate: gen ~ env (structur + geog) 69.5400 0.1491 0.001*** 0.2725 0.1492
Pure structure: gen ~ structur (env + geog) 48.9400 0.1049 0.001*** 0.1918 0.1051
Pure geography: gen ~ geog (env + structur) 22.3200 0.0478 0.001*** 0.0874 0.0478
140.8000 _ _ _ _
Counfounded climate/structure/geography 114.3100 _ _ 0.448 _
Total unexplained 211.0222 _ _ _ _
Total inertia 466.1322 _ _ _ _